# Notebook setup: widen the rendered container and make project modules importable.
from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))
from IPython.display import IFrame
import sys
# Make the project root importable (provides utils.paths and preprocessing).
sys.path.insert(0,'../')
from utils.paths import *
import pandas as pd
import numpy as np
# from plotly.offline import init_notebook_mode, iplot
# import cufflinks as cf
# init_notebook_mode()
# cf.go_offline()
# NOTE(review): in a plain script a `from __future__` import must be the very
# first statement; this only works because each notebook cell compiles separately.
from __future__ import division
def table(no):
    """Load supplementary table `no` from the paper's data directory.

    There are 13 additional tables, stored as 't0001-...csv' .. 't0013-...csv'.
    """
    fname = 't00%02d' % no + '-10.1080%2F10691898.2018.1434342.csv'
    return pd.read_csv(path_SBA + fname)
# Load the raw SBA national loan dataset (one row per approved loan).
nat = pd.read_csv(path_SBA + 'SBAnational.csv', low_memory=False)
# Preprocessing
# Project-local column-cleaning helpers; `reload` (Python 2 builtin) picks up
# edits made to preprocessing.py while the kernel is running.
import preprocessing as pp
reload(pp)
%%time
# '1976A' is a malformed fiscal-year entry; drop it so the column casts to int.
nat = nat[nat.ApprovalFY != '1976A']
nat['ApprovalFY'] = nat.ApprovalFY.astype(int)
# nat = nat[(nat.ApprovalFY >= 1990) & pd.notnull(nat.MIS_Status)]
# Keep only rows where the loan outcome (MIS_Status) and borrower name are known.
nat = nat[pd.notnull(nat.MIS_Status)]
nat = nat[pd.notnull(nat.Name)]
# convert to timestamp
nat['ApprovalDate'] = pd.to_datetime(nat['ApprovalDate'], errors = 'coerce')
nat['DisbursementDate'] = pd.to_datetime(nat['DisbursementDate'], errors = 'coerce')
# convert $ to float
# pp.to_float presumably strips '$'/',' currency formatting -- TODO confirm in preprocessing.py.
nat['DisbursementGross'] = nat['DisbursementGross'].apply(pp.to_float)
nat['BalanceGross'] = nat['BalanceGross'].apply(pp.to_float)
nat['ChgOffPrinGr'] = nat['ChgOffPrinGr'].apply(pp.to_float)
nat['GrAppv'] = nat['GrAppv'].apply(pp.to_float)
nat['SBA_Appv'] = nat['SBA_Appv'].apply(pp.to_float)
# Default
# Binary target derived from MIS_Status -- presumably 1 = charged off; verify pp.default.
nat['default'] = nat.MIS_Status.apply(pp.default)
%%time
# cleaning data
# Zip5d: 5-digit ZIP; pp.zip_5d also receives State, presumably to repair bad codes -- TODO confirm.
nat['Zip5d'] = nat.apply(lambda x: pp.zip_5d(x['Zip'], x['State']), axis = 1)
# First three ZIP digits identify the USPS sectional center facility.
nat['Zip3d'] = nat.Zip5d.str[:3]
nat['LowDoc'] = nat.LowDoc.apply(pp.clean_LowDoc)
# Share of the gross approval amount that SBA guarantees.
nat['SBA_ratio'] = nat.SBA_Appv / nat.GrAppv
nat['RevLineCr'] = nat.RevLineCr.apply(pp.clean_RevLineCr)
nat['Zip_length'] = nat.Zip.apply(lambda x: len(str(x)))
# RealEstate dummy derived from Term (see prose below: Term >= 240 months).
nat['RealEstate'] = nat.Term.apply(pp.RealEstate)
nat['NAICS_default_rate'] = nat.NAICS.apply(pp.naics_defaut_rate)
# Economic sector = first two NAICS digits.
nat['NAICS_group'] = nat.NAICS.apply(pp.naics_sector)
nat['FranchiseCode'] = nat.FranchiseCode.apply(pp.franchise)
# fix missing state
nat = pp.fix_missing_state(nat)
# Name2 is the company key used for joins below: "NAME|STATE(ZIP5)".
nat['Name2'] = nat['Name'] + '|' + nat['State'] + '(' + nat['Zip5d'].astype(str) + ')'
nat.Name2.sample(10)
nat.Name2.value_counts().head()
nat.Name.value_counts().head()
# Create a record of company applied for loan:
# Name2 -> list of fiscal years in which that company had a loan approved.
# The original loop did two positional `.iloc[i]` row lookups per iteration,
# which is extremely slow on a large frame; iterating the two columns directly
# with zip() builds the identical dict in a single fast pass.
loan_record = {}
for i in nat.Name2.unique():
    loan_record[i] = []
# len(loan_record.keys())
for name2, fy in zip(nat.Name2, nat.ApprovalFY):
    loan_record[name2].append(fy)
# Tidy the loan_record dict into a DataFrame keyed by Name2 and persist it.
loan_record_df = (pd.DataFrame([loan_record])
                  .T
                  .rename(columns = {0: 'loan_list'}))
# First fiscal year the company ever borrowed.
loan_record_df['loan_start'] = loan_record_df.loan_list.apply(min)
# Per-company {fiscal year: number of loans} counter.
loan_record_df['loan_record_dict'] = loan_record_df.loan_list.apply(
    lambda years: pd.Series(years).value_counts().to_dict())
loan_record_df = (loan_record_df
                  .reset_index()
                  .rename(columns = {'index': 'Name2'}))
# Discard entries whose Name2 key could not be built (NaN from missing fields).
loan_record_df = loan_record_df[pd.notnull(loan_record_df.Name2)]
loan_record_df['Name'] = loan_record_df['Name2'].apply(lambda key: key.split('|')[0])
loan_record_df['suffix'] = loan_record_df['Name'].apply(pp.company_suffix)
display(loan_record_df.head(10))
save_csv(loan_record_df, 'extra_company_info.csv')
# Create a record of company default:
# Name2 -> list of fiscal years in which that company had a defaulted loan.
nat_d = nat[nat.default == 1].reset_index(drop = True)
default_record = {}
for i in nat_d.Name2.unique():
    default_record[i] = []
# Single-pass zip over the two columns replaces the original per-row `.iloc[i]`
# loop (two positional lookups per row), matching the loan_record build above.
for name2, fy in zip(nat_d.Name2, nat_d.ApprovalFY):
    default_record[name2].append(fy)
# Tidy the default_record dict into a DataFrame keyed by Name2 and persist it.
default_record_df = (pd.DataFrame([default_record])
                     .T
                     .rename(columns = {0: 'default_list'}))
# Per-company {fiscal year: number of defaults} counter.
default_record_df['default_record_dict'] = default_record_df.default_list.apply(
    lambda years: pd.Series(years).value_counts().to_dict())
default_record_df = (default_record_df
                     .reset_index()
                     .rename(columns = {'index': 'Name2'}))
# Discard entries whose Name2 key is missing.
default_record_df = default_record_df[pd.notnull(default_record_df.Name2)]
default_record_df['Name'] = default_record_df['Name2'].apply(lambda key: key.split('|')[0])
display(default_record_df.head(10))
save_csv(default_record_df, 'company_default_record.csv')
# Open saved loan records
from ast import literal_eval
# save_csv wrote ';'-separated files; the dict columns round-trip as their repr,
# so literal_eval (safe: parses literals only, unlike eval) restores them.
extinf = pd.read_csv(path_SBA + 'extra_company_info.csv', sep = ';', low_memory=False)
extinf['loan_record_dict'] = extinf['loan_record_dict'].apply(literal_eval)
# Companies that borrowed in more than 3 distinct fiscal years.
extinf[extinf.loan_record_dict.apply(lambda x: len(x.keys())) > 3].head()
# Open saved default records
# NOTE: rebinds `default_record` from the dict above to a DataFrame.
default_record = pd.read_csv(path_SBA + 'company_default_record.csv', sep = ';', low_memory=False)
default_record['default_record_dict'] = default_record['default_record_dict'].apply(literal_eval)
default_record[default_record.default_record_dict.apply(lambda x: len(x.keys())) > 3].head()
# Join the per-company loan/default history back onto the loan-level table.
nat = nat.merge(extinf[['Name2', 'loan_start', 'loan_record_dict', 'suffix']] , how = 'left', on = 'Name2')
# Derived per-loan features; semantics live in preprocessing.py -- presumably
# Loan_age = years since first loan, Previous_loan / default_times = counts
# before the approval year. TODO confirm against pp implementations.
nat['Loan_age'] = nat.apply(lambda x: pp.loan_age(x['ApprovalFY'], x['loan_record_dict']), axis = 1)
nat['Previous_loan'] = nat.apply(lambda x: pp.previous_loan(x['ApprovalFY'], x['loan_record_dict']), axis = 1)
nat = nat.merge(default_record[['Name2', 'default_record_dict']] , how = 'left', on = 'Name2')
nat['default_times'] = nat.apply(lambda x: pp.default_times(x['ApprovalFY'], x['default_record_dict']), axis = 1)
nat[['Loan_age', 'Previous_loan', 'default_times']].info()
nat.head()
# Exploratory plots; .iplot is cufflinks (requires the commented-out offline setup above).
nat.GrAppv.iplot(kind = 'hist', bins = 100, title = 'Histogram of grant approved', xTitle = 'Grant')
# No. of loan each year
# .count().max(1) yields the per-year row count (max non-null count across columns).
nat.groupby('ApprovalFY').count().max(1).iplot(kind = 'bar', title = 'Loan cases')
(nat[nat.default == 1].groupby('ApprovalFY').count().max(1) /
nat.groupby('ApprovalFY').count().max(1)).iplot(kind = 'bar', title = 'Default rate')
# Total SBA-guaranteed amount of defaulted loans per approval year.
nat[nat.default == 1].groupby('ApprovalFY').SBA_Appv.sum().iplot(kind = 'bar')
nat[nat.default == 1].ChgOffPrinGr.iplot(kind = 'hist', bins = 50)
nat.head().T
nat.State.value_counts().head()
# default rate
# True division is safe here thanks to `from __future__ import division`.
nat['default'].sum() / len(nat)
nat.default.value_counts()
LowDoc (Y = Yes, N = No): In order to process more loans efficiently, a “LowDoc Loan” program was implemented where loans under $150,000 can be processed using a one-page application. “Yes” indicates loans with a one-page application, and “No” indicates loans with more information attached to the application. In this dataset, 87.31% are coded as N (No) and 12.31% as Y (Yes) for a total of 99.62%. It is worth noting that 0.38% have other values (0, 1, A, C, R, S); these are data entry errors.
# Spot-check LowDoc values (cleaned earlier by pp.clean_LowDoc).
nat.LowDoc.sample(10)
NAICS (North American Industry Classification System): This is a 2- through 6-digit hierarchical classification system used by Federal statistical agencies in classifying business establishments for the collection, analysis, and presentation of statistical data describing the U.S. economy. The first two digits of the NAICS classification represent the economic sector.
# Supplementary table 3 from the paper (NAICS sector reference).
table(3)
nat.NAICS.value_counts().head()
# nat.NAICS.apply(lambda x: len(str(x))).value_counts()
# Loan counts per 2-digit NAICS economic sector.
nat['NAICS_group'].value_counts().sort_index()
Whether a loan is backed by real estate (possession of land) is another risk indicator that is discussed. The rationale for this indicator is that the value of the land is often large enough to cover the amount of any principal outstanding, thereby reducing the probability of default.
Since the term of the loan is a function of the expected lifetime of the assets, loans backed by real estate will have terms 20 years or greater (≥240 months) and are the only loans granted for such a long term, whereas loans not backed by real estate will have terms less than 20 years (<240 months). Therefore, the authors created a dummy variable, “RealEstate,” where “RealEstate” = 1 if “Term” ≥240 months and “RealEstate” = 0 if “Term” <240 months.
# Inspect the frame, including the engineered RealEstate dummy.
nat.head()
The portion — the percentage of the loan that is guaranteed by the SBA (represented as “Portion” in the dataset) — is a final risk indicator that is discussed in our courses. This is one of the variables that the authors generated by calculating the ratio of the amount of the loan the SBA guarantees to the gross amount approved by the bank (SBA_Appv/GrAppv).
nat.SBA_ratio.iplot(kind = 'hist', bins = 10, title = 'Histogram of SBA approved ratio', xTitle = 'SBA_ratio')
nat[nat.default == 1].SBA_ratio.iplot(kind = 'hist', bins = 10)
# Mean guaranteed portion for defaulted (1) vs non-defaulted (0) loans.
nat.groupby('default').SBA_ratio.mean()
nat.head().T
Revolving Line of Credit: Y = Yes
# Distribution of revolving-line-of-credit codes (after pp.clean_RevLineCr).
nat.RevLineCr.value_counts()
A ZIP Code is a postal code used by the United States Postal Service (USPS) in a system it introduced in 1963.
The first digit of the ZIP Code is allocated as follows:
0 = Connecticut (CT), Massachusetts (MA), Maine (ME), New Hampshire (NH), New Jersey (NJ), New York (NY, Fishers Island only), Puerto Rico (PR), Rhode Island (RI), Vermont (VT), Virgin Islands (VI), Army Post Office Europe (AE), Fleet Post Office Europe (AE) 1 = Delaware (DE), New York (NY), Pennsylvania (PA) 2 = District of Columbia (DC), Maryland (MD), North Carolina (NC), South Carolina (SC), Virginia (VA), West Virginia (WV) 3 = Alabama (AL), Florida (FL), Georgia (GA), Mississippi (MS), Tennessee (TN), Army Post Office Americas (AA), Fleet Post Office Americas (AA) 4 = Indiana (IN), Kentucky (KY), Michigan (MI), Ohio (OH) 5 = Iowa (IA), Minnesota (MN), Montana (MT), North Dakota (ND), South Dakota (SD), Wisconsin (WI) 6 = Illinois (IL), Kansas (KS), Missouri (MO), Nebraska (NE) 7 = Arkansas (AR), Louisiana (LA), Oklahoma (OK), Texas (TX) 8 = Arizona (AZ), Colorado (CO), Idaho (ID), New Mexico (NM), Nevada (NV), Utah (UT), Wyoming (WY) 9 = Alaska (AK), American Samoa (AS), California (CA), Guam (GU), Hawaii (HI), Marshall Islands (MH), Federated States of Micronesia (FM), Northern Mariana Islands (MP), Oregon (OR), Palau (PW), Washington (WA), Army Post Office Pacific (AP), Fleet Post Office Pacific (AP)
The next two digits represent the sectional center facility (SCF) (e.g. 477xx = Vanderburgh County, Indiana), and the fourth and fifth digits represent the area of the city (if in a metropolitan area), or a village/town (outside metro areas): 47722 (4=Indiana, 77=Vanderburgh County, 22=University of Evansville area). When a sectional center facility's area crosses state lines, that facility is assigned separate three-digit prefixes for the states that it serves.
Connecticut (CT) Massachusetts (MA) Maine (ME) New Hampshire (NH) New Jersey (NJ) New York (NY) Puerto Rico (PR) Rhode Island (RI) Vermont (VT) Virgin Islands (VI) Army Post Office Europe (AE) Fleet Post Office Europe (AE)
'CT','MA','ME','NH','NJ','NY','PR','RI','VT','VI','AE','AE'
# Raw ZIP string lengths -- values under 5 presumably lost leading zeros or are missing.
nat['Zip_length'].value_counts()
It seems like some of the ZIP codes have missing values!
# '99999' appears to be the placeholder assigned for unusable ZIPs -- TODO confirm pp.zip_5d.
nat[nat['Zip5d'] == '99999'].shape
nat['Zip5d'].value_counts().head()
nat['Zip3d'].value_counts().head()
# Keep only the modelling-relevant columns, in a stable order.
nat = nat[['LoanNr_ChkDgt', 'Name', 'City', 'State', 'Zip5d', 'Zip3d', 'Bank', 'BankState', 'NAICS', 'NAICS_group',
'NAICS_default_rate', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob',
'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross',
'BalanceGross', 'MIS_Status', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv', 'default', 'RealEstate', 'SBA_ratio']]
# Per-column missing-value counts.
pd.isnull(nat).sum()
# List what is already uploaded to the study's S3 prefix.
!aws s3 ls --human-readable s3://eh-home/ehda-calvin/SBA_study/
save_csv(nat, 'SBAnational_new.csv')
df = pd.DataFrame()
for f in ['FOIA - 7(a)(FY1991-FY1999).xlsx',
'FOIA - 7(a)(FY2000-FY2009).xlsx',
'FOIA - 7(a)(FY2010-Present).xlsx']:
temp = pd.read_excel(path_SBA + f)
temp = temp[(pd.notnull(temp.BorrName)) & (pd.notnull(temp.BorrState)) &
(pd.notnull(temp.BorrZip)) & (temp.Program == '7A')]
temp = temp[['BorrName', 'BorrState', 'BorrZip', 'BusinessType']]
temp['Name2'] = temp['BorrName'] + '|' + temp['BorrState'] + '(' + temp['BorrZip'].astype(str) + ')'
display(temp.head())
df = pd.concat([df, temp])
df.shape
df = df[pd.notnull(df.BusinessType)]
print df.shape
df = df.drop_duplicates('Name2', keep = 'last')
print df.shape
df.head()
save_csv(df, 'company_business_type.csv')
# Build a ZIP -> county FIPS lookup from the Census ZCTA/county relationship file.
census_rel_url = 'https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt'
ztf = pd.read_csv(census_rel_url, usecols = ['ZCTA5','STATE','COUNTY'], dtype = str)
# County FIPS = 2-digit state code followed by the 3-digit county code.
ztf['fips'] = ztf.STATE + ztf.COUNTY
ztf = ztf.rename(columns = {'ZCTA5': 'zip'})[['zip', 'fips']]
# A ZCTA can straddle counties; keep the first county listed for each ZIP.
ztf1 = ztf.drop_duplicates('zip', keep='first')
ztf1.shape
ztf1.sample(10)
save_csv(ztf1, 'ztf1.csv')
# Reload the cleaned national table, forcing ZIP columns to stay strings
# (numeric parsing would drop leading zeros).
nat = pd.read_csv(path_SBA + 'SBAnational_new.csv', low_memory=False, sep = ';',
dtype = {'Zip': str, 'Zip5d': str, 'Zip3d': str})
# Attach the county FIPS code via the 5-digit ZIP.
nat = nat.merge(ztf1, how = 'left', left_on = 'Zip5d', right_on = 'zip')
nat.sample(5)[['Zip', 'Zip3d', 'Zip5d', 'zip', 'fips']]
print pd.isnull(nat.fips).sum(), pd.notnull(nat.fips).sum()
# NOTE(review): the column selection above dropped Name2 before saving, yet the
# reloaded frame is used with Name2 here and in the merge below -- this only
# works if the saved CSV still contained Name2. Verify the execution order.
nat.Name2.isin(df.Name2).sum()
df.head()
print nat.shape
# Left-join BusinessType; row count should be unchanged since Name2 was deduplicated in df.
nat = nat.merge(df[['Name2', 'BusinessType']] , how = 'left', on = 'Name2')
print nat.shape
nat.head()
nat.BusinessType.value_counts()
nat[['BusinessType']].info()
save_csv(nat, 'SBAnational_new.csv')